In [181]:
#import the libraries

import numpy as np
import pandas as pd 
import seaborn as sns
%matplotlib inline
import matplotlib.pylab as plt

from sklearn.model_selection  import train_test_split
from sklearn.cluster import KMeans  # NOTE(review): imported but never used in this notebook

from scipy.stats import zscore
from scipy.stats import iqr  # NOTE(review): imported but never used in this notebook
import sklearn.metrics as metrics

import warnings
warnings.filterwarnings('ignore')  # NOTE(review): suppresses ALL warnings, including deprecation notices

from sklearn.decomposition import PCA
In [136]:
# Load the vehicle silhouette dataset (comma-separated values).
vehicle = pd.read_csv("vehicle.csv", sep=",")
In [137]:
vehicle
Out[137]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio class
0 95 48.0 83.0 178.0 72.0 10 162.0 42.0 20.0 159 176.0 379.0 184.0 70.0 6.0 16.0 187.0 197 van
1 91 41.0 84.0 141.0 57.0 9 149.0 45.0 19.0 143 170.0 330.0 158.0 72.0 9.0 14.0 189.0 199 van
2 104 50.0 106.0 209.0 66.0 10 207.0 32.0 23.0 158 223.0 635.0 220.0 73.0 14.0 9.0 188.0 196 car
3 93 41.0 82.0 159.0 63.0 9 144.0 46.0 19.0 143 160.0 309.0 127.0 63.0 6.0 10.0 199.0 207 van
4 85 44.0 70.0 205.0 103.0 52 149.0 45.0 19.0 144 241.0 325.0 188.0 127.0 9.0 11.0 180.0 183 bus
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
841 93 39.0 87.0 183.0 64.0 8 169.0 40.0 20.0 134 200.0 422.0 149.0 72.0 7.0 25.0 188.0 195 car
842 89 46.0 84.0 163.0 66.0 11 159.0 43.0 20.0 159 173.0 368.0 176.0 72.0 1.0 20.0 186.0 197 van
843 106 54.0 101.0 222.0 67.0 12 222.0 30.0 25.0 173 228.0 721.0 200.0 70.0 3.0 4.0 187.0 201 car
844 86 36.0 78.0 146.0 58.0 7 135.0 50.0 18.0 124 155.0 270.0 148.0 66.0 0.0 25.0 190.0 195 car
845 85 36.0 66.0 123.0 55.0 5 120.0 56.0 17.0 128 140.0 212.0 131.0 73.0 1.0 18.0 186.0 190 van

846 rows × 19 columns

In [138]:
vehicle.describe(include="all").T
Out[138]:
count unique top freq mean std min 25% 50% 75% max
compactness 846 NaN NaN NaN 93.6785 8.23447 73 87 93 100 119
circularity 841 NaN NaN NaN 44.8288 6.15217 33 40 44 49 59
distance_circularity 842 NaN NaN NaN 82.1105 15.7783 40 70 80 98 112
radius_ratio 840 NaN NaN NaN 168.888 33.5202 104 141 167 195 333
pr.axis_aspect_ratio 844 NaN NaN NaN 61.6789 7.89146 47 57 61 65 138
max.length_aspect_ratio 846 NaN NaN NaN 8.56738 4.60122 2 7 8 10 55
scatter_ratio 845 NaN NaN NaN 168.902 33.2148 112 147 157 198 265
elongatedness 845 NaN NaN NaN 40.9337 7.81619 26 33 43 46 61
pr.axis_rectangularity 843 NaN NaN NaN 20.5824 2.59293 17 19 20 23 29
max.length_rectangularity 846 NaN NaN NaN 147.999 14.5157 118 137 146 159 188
scaled_variance 843 NaN NaN NaN 188.631 31.411 130 167 179 217 320
scaled_variance.1 844 NaN NaN NaN 439.494 176.667 184 318 363.5 587 1018
scaled_radius_of_gyration 844 NaN NaN NaN 174.71 32.5848 109 149 173.5 198 268
scaled_radius_of_gyration.1 842 NaN NaN NaN 72.4477 7.48619 59 67 71.5 75 135
skewness_about 840 NaN NaN NaN 6.36429 4.92065 0 2 6 9 22
skewness_about.1 845 NaN NaN NaN 12.6024 8.93608 0 5 11 19 41
skewness_about.2 845 NaN NaN NaN 188.92 6.15581 176 184 188 193 206
hollows_ratio 846 NaN NaN NaN 195.632 7.4388 181 190.25 197 201 211
class 846 3 car 429 NaN NaN NaN NaN NaN NaN NaN
In [139]:
# compactness: mean and median are almost equal, suggesting an approximately
# normal distribution with little skewness and few outliers.
# circularity: also looks normally distributed (mean and median are close).
# scatter_ratio: mean sits well above the median, hinting at right skew / outliers.
# scaled_variance and scaled_variance.1 show the same skewed pattern.
In [140]:
vehicle.dtypes
Out[140]:
compactness                      int64
circularity                    float64
distance_circularity           float64
radius_ratio                   float64
pr.axis_aspect_ratio           float64
max.length_aspect_ratio          int64
scatter_ratio                  float64
elongatedness                  float64
pr.axis_rectangularity         float64
max.length_rectangularity        int64
scaled_variance                float64
scaled_variance.1              float64
scaled_radius_of_gyration      float64
scaled_radius_of_gyration.1    float64
skewness_about                 float64
skewness_about.1               float64
skewness_about.2               float64
hollows_ratio                    int64
class                           object
dtype: object
In [141]:
# Columns with NO missing values (per the isnull().sum() output below):
# compactness, max.length_aspect_ratio, max.length_rectangularity,
# hollows_ratio and class; every other feature has at least one missing value.
# (The original note wrongly listed circularity as complete — it has 5 NaNs.)
# All attributes are numeric except `class`, which is a string label.
In [142]:
vehicle.shape
Out[142]:
(846, 19)
In [143]:
vehicle.isnull().any()
Out[143]:
compactness                    False
circularity                     True
distance_circularity            True
radius_ratio                    True
pr.axis_aspect_ratio            True
max.length_aspect_ratio        False
scatter_ratio                   True
elongatedness                   True
pr.axis_rectangularity          True
max.length_rectangularity      False
scaled_variance                 True
scaled_variance.1               True
scaled_radius_of_gyration       True
scaled_radius_of_gyration.1     True
skewness_about                  True
skewness_about.1                True
skewness_about.2                True
hollows_ratio                  False
class                          False
dtype: bool
In [144]:
vehicle.nunique()
Out[144]:
compactness                     44
circularity                     27
distance_circularity            63
radius_ratio                   134
pr.axis_aspect_ratio            37
max.length_aspect_ratio         21
scatter_ratio                  131
elongatedness                   35
pr.axis_rectangularity          13
max.length_rectangularity       66
scaled_variance                128
scaled_variance.1              422
scaled_radius_of_gyration      143
scaled_radius_of_gyration.1     39
skewness_about                  23
skewness_about.1                41
skewness_about.2                30
hollows_ratio                   31
class                            3
dtype: int64
In [145]:
vehicle.isnull().sum()
Out[145]:
compactness                    0
circularity                    5
distance_circularity           4
radius_ratio                   6
pr.axis_aspect_ratio           2
max.length_aspect_ratio        0
scatter_ratio                  1
elongatedness                  1
pr.axis_rectangularity         3
max.length_rectangularity      0
scaled_variance                3
scaled_variance.1              2
scaled_radius_of_gyration      2
scaled_radius_of_gyration.1    4
skewness_about                 6
skewness_about.1               1
skewness_about.2               1
hollows_ratio                  0
class                          0
dtype: int64
In [146]:
# Treat missing values: replace each NaN with its column's mean.
#
# Fix: `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22, so the original cell breaks on any modern install (the
# sklearn replacement is `sklearn.impute.SimpleImputer`). The pandas-native
# `fillna` below produces exactly the same per-column mean imputation and
# collapses the 14 copy-pasted fit_transform calls into one statement.

na_cols = [
    'circularity', 'distance_circularity', 'radius_ratio',
    'pr.axis_aspect_ratio', 'scatter_ratio', 'elongatedness',
    'pr.axis_rectangularity', 'scaled_variance', 'scaled_variance.1',
    'scaled_radius_of_gyration', 'scaled_radius_of_gyration.1',
    'skewness_about', 'skewness_about.1', 'skewness_about.2',
]
vehicle[na_cols] = vehicle[na_cols].fillna(vehicle[na_cols].mean())
In [147]:
vehicle.isnull().sum()
Out[147]:
compactness                    0
circularity                    0
distance_circularity           0
radius_ratio                   0
pr.axis_aspect_ratio           0
max.length_aspect_ratio        0
scatter_ratio                  0
elongatedness                  0
pr.axis_rectangularity         0
max.length_rectangularity      0
scaled_variance                0
scaled_variance.1              0
scaled_radius_of_gyration      0
scaled_radius_of_gyration.1    0
skewness_about                 0
skewness_about.1               0
skewness_about.2               0
hollows_ratio                  0
class                          0
dtype: int64
In [148]:
# Comparing the dataframe before and after imputation confirms that the NaN
# values in the affected columns have been replaced using the MEAN strategy
# (each missing entry now holds its column's mean — note the original comment
# said "mode", which contradicted strategy='mean').
In [149]:
# Horizontal box plots of every numeric attribute — a quick summary view
# to trace out which columns carry outliers.
fig, ax = plt.subplots(figsize=(20, 20))
sns.boxplot(data=vehicle, orient="h", ax=ax)
In [150]:
#pr.axis_aspect_ratio, skewness_about, max_length_aspect_ratio, skewness_about_1,
#scaled_radius_of_gyration.1, scaled_variance.1, radius_ratio, skewness_about, scaled_variance.1 are some of the attributes with outliers. 
#which is visible with all dotted points
In [151]:
#Treating Outliers.

# Compute the inter-quartile range (IQR = Q3 - Q1) per numeric column.
# Q1, Q3 and IQR are reused by the row-filtering cell below — keep the names.
Q1 = vehicle.quantile(0.25)
Q3 = vehicle.quantile(0.75)
IQR = Q3 - Q1
print(IQR)
compactness                     13.00
circularity                      9.00
distance_circularity            28.00
radius_ratio                    54.00
pr.axis_aspect_ratio             8.00
max.length_aspect_ratio          3.00
scatter_ratio                   51.00
elongatedness                   13.00
pr.axis_rectangularity           4.00
max.length_rectangularity       22.00
scaled_variance                 50.00
scaled_variance.1              268.50
scaled_radius_of_gyration       49.00
scaled_radius_of_gyration.1      8.00
skewness_about                   7.00
skewness_about.1                14.00
skewness_about.2                 9.00
hollows_ratio                   10.75
dtype: float64
In [152]:
# Keep only rows where EVERY column lies inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
# (standard Tukey fence); 846 -> 813 rows, i.e. 33 outlier rows removed.
# NOTE(review): `vehicle_clean` is overwritten from the raw `vehicle` two
# cells below and never used afterwards, so this outlier removal currently
# has no effect on the rest of the notebook — confirm intended data lineage.
vehicle_clean = vehicle[~((vehicle < (Q1 - 1.5 * IQR)) |(vehicle > (Q3 + 1.5 * IQR))).any(axis=1)]
vehicle_clean.shape
Out[152]:
(813, 19)
In [153]:
#We can see that all out boxplot for all the attributes which had outlier have been treate and removed. Since no. of outliers were less we opted to remove it. 
#Generally we avoid this as it can lead to info loss in case of large data sets with large no of outliers
In [154]:
# Let's Drop Class column and see the correlation Matrix & 
# Pairplot Before using this dataframe for PCA as PCA should only be perfromed on independent attribute
In [155]:
vehicle_clean= vehicle.drop('class', axis=1)
In [156]:
# Correlation heatmap of the numeric attributes.
corr = vehicle.corr()
fig = plt.figure(figsize=(15,15))
ax = fig.add_subplot(111)
cax = ax.matshow(corr,cmap='coolwarm', vmin=-1, vmax=1)
fig.colorbar(cax)
# Fix: label the axes from the columns actually present in the correlation
# matrix. `DataFrame.corr()` silently drops the non-numeric `class` column,
# so the matrix is 18x18 while `vehicle.columns` has 19 names — the original
# code shifted every tick label out of alignment with its row/column.
ticks = np.arange(0,len(corr.columns),1)
ax.set_xticks(ticks)
plt.xticks(rotation=90)
ax.set_yticks(ticks)
ax.set_xticklabels(corr.columns)
ax.set_yticklabels(corr.columns)
plt.show()
In [169]:
# Covariance matrix of the feature columns.
# NOTE(review): the printed matrix below is 11x11, which means this cell
# (In [169]) was actually executed AFTER the correlated columns were dropped
# and `class` was label-encoded by later cells. On a fresh Restart-&-Run-All
# at this position, np.cov would fail on the non-numeric `class` column —
# confirm the intended execution order.
covMatrix = np.cov(vehicle,rowvar=False)
print(covMatrix)
[[ 6.78065662e+01  3.46204941e+01  1.02386510e+02  1.89733201e+02
   5.95205839e+00  5.61695413e+00 -1.53790566e+01  9.51583263e+00
   1.15742922e+01  2.23917272e+01 -1.95394967e-01]
 [ 3.46204941e+01  3.76252587e+01  7.65686612e+01  1.27223538e+02
   7.41481040e+00  7.08999289e+00  2.41158675e+00  4.34273015e+00
  -6.49222889e-01  2.06782289e+00 -6.88246593e-01]
 [ 1.02386510e+02  7.65686612e+01  2.47776010e+02  4.03300589e+02
   1.96525919e+01  1.91606656e+01 -2.65512088e+01  8.78410059e+00
   3.73311445e+01  3.88861727e+01 -7.17304530e-01]
 [ 1.89733201e+02  1.27223538e+02  4.03300589e+02  1.11562542e+03
   1.74695555e+02  6.91638941e+01 -4.51060205e+01  7.97881580e+00
   5.18536802e+01  1.17091234e+02 -4.27457312e+00]
 [ 5.95205839e+00  7.41481040e+00  1.96525919e+01  1.74695555e+02
   6.21277924e+01  2.35267730e+01  8.99351596e+00 -2.26012362e+00
  -2.26199692e+00  1.56975883e+01 -5.44117895e-01]
 [ 5.61695413e+00  7.08999289e+00  1.91606656e+01  6.91638941e+01
   2.35267730e+01  2.11711948e+01  1.01570886e+01  3.48309383e-01
   1.78706348e+00  4.92598095e+00  6.70745730e-01]
 [-1.53790566e+01  2.41158675e+00 -2.65512088e+01 -4.51060205e+01
   8.99351596e+00  1.01570886e+01  5.57777523e+01 -3.23359639e+00
  -8.43188365e+00 -4.45899380e+01 -1.11590606e+00]
 [ 9.51583263e+00  4.34273015e+00  8.78410059e+00  7.97881580e+00
  -2.26012362e+00  3.48309383e-01 -3.23359639e+00  2.40408622e+01
  -1.53362691e+00  3.53319527e+00  4.10016906e-01]
 [ 1.15742922e+01 -6.49222889e-01  3.73311445e+01  5.18536802e+01
  -2.26199692e+00  1.78706348e+00 -8.43188365e+00 -1.53362691e+00
   7.97590477e+01  1.36250215e+01 -6.69290291e-02]
 [ 2.23917272e+01  2.06782289e+00  3.88861727e+01  1.17091234e+02
   1.56975883e+01  4.92598095e+00 -4.45899380e+01  3.53319527e+00
   1.36250215e+01  5.53357072e+01  1.23197085e+00]
 [-1.95394967e-01 -6.88246593e-01 -7.17304530e-01 -4.27457312e+00
  -5.44117895e-01  6.70745730e-01 -1.11590606e+00  4.10016906e-01
  -6.69290291e-02  1.23197085e+00  4.92986137e-01]]
In [157]:
# Strong/fair correlation:
#      - scaled_variance & scaled_variance.1 are strongly correlated (~0.98)
#      - skewness_about.2 & hollows_ratio are strongly correlated (~0.89)
#      - distance_circularity & radius_ratio have a high positive correlation (~0.81)
#      - compactness & circularity, and radius_ratio & pr.axis_aspect_ratio,
#        are moderately correlated (~0.67)
#      - scaled_variance & scaled_radius_of_gyration, and circularity &
#        distance_circularity, are highly correlated (~0.79)
#      - pr.axis_rectangularity & max.length_rectangularity are strongly correlated (~0.81)
#      - scatter_ratio & elongatedness have a strong NEGATIVE correlation (~-0.97)
#      - elongatedness & pr.axis_rectangularity have a strong NEGATIVE correlation (~-0.95)

# Little to no correlation:
#      - max.length_aspect_ratio & radius_ratio: average correlation (~0.5)
#      - pr.axis_aspect_ratio & max.length_aspect_ratio: very little correlation
#      - scaled_radius_of_gyration & scaled_radius_of_gyration.1: very little correlation
#      - scaled_radius_of_gyration.1 & skewness_about: very little correlation
#      - skewness_about & skewness_about.1: not correlated
#      - skewness_about.1 & skewness_about.2: not correlated
In [158]:
# Pairwise scatter plots with per-feature marginal distributions
# (the redundant `.iloc[:, :]` slice selected the whole frame anyway).
pair_grid = sns.pairplot(vehicle)
plt.show()
In [159]:
# As observed in the correlation heatmap, the pairplot tells the same story:
# scaled_variance & scaled_variance.1 have a very strong positive correlation
# (~0.98), and skewness_about.2 & hollows_ratio are also strongly positively
# correlated (~0.89).

# scatter_ratio & elongatedness show a very strong negative correlation;
# elongatedness & pr.axis_rectangularity are also strongly negatively correlated.

# Strongly correlated pairs need to be dropped or treated carefully before
# model building.

# Most attributes look roughly normally distributed.
# scaled_variance.1, skewness_about, skewness_about.1 and scatter_ratio appear right-skewed.
# pr.axis_rectangularity appears to contain outliers (gaps visible in its plot).
In [160]:
# From above correlation matrix we can see that there are many features which are highly correlated. 
# if we carefully analyse, we will find that many features are there which having more than 0.9 correlation. 
# so we can decide to get rid of those columns whose correlation is +-0.9 or above.There are 8 such columns:

# 1. max.length_rectangularity
# 2. scaled_radius_of_gyration
# 3. skewness_about.2
# 4. scatter_ratio
# 5. elongatedness
# 6. pr.axis_rectangularity
# 7. scaled_variance
# 8. scaled_variance.1
In [161]:
vehicle = vehicle.drop(['max.length_rectangularity', 'scaled_radius_of_gyration', 'skewness_about.2', 'scatter_ratio', 'elongatedness', 'pr.axis_rectangularity', 'scaled_variance', 'scaled_variance.1'], axis=1)
In [162]:
# Label-encode every string (object-dtype) column as integer category codes.
object_columns = [col for col in vehicle.columns if vehicle[col].dtype == 'object']
for col in object_columns:
    vehicle[col] = pd.Categorical(vehicle[col]).codes
In [163]:
vehicle.groupby("class").count()
Out[163]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scaled_radius_of_gyration.1 skewness_about skewness_about.1 hollows_ratio
class
0 218 218 218 218 218 218 218 218 218 218
1 429 429 429 429 429 429 429 429 429 429
2 199 199 199 199 199 199 199 199 199 199
In [164]:
# Splitting the data into train and test sets.

X = vehicle.drop("class" , axis=1)  # independent attributes
y = vehicle["class"]   
test_size = 0.30 # 70:30 train/test split
seed = 100  # random-number seed for repeatability of the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)
In [165]:
from sklearn.svm import SVC

# Building a Support Vector Machine on train data.
# NOTE(review): with an RBF kernel and gamma=1 on these UN-scaled features the
# model predicts only the majority class — see the confusion matrix below,
# where every test sample is assigned class 1 (accuracy ~0.47). Scaling the
# features (done later via zscore) fixes this.
svc_model = SVC(C= 1, kernel='rbf', gamma= 1)
svc_model.fit(X_train, y_train)

# Predict on the held-out 30% and report confusion matrix / accuracy / per-class metrics.
y_pred = svc_model.predict(X_test)

cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
print(cnf_matrix)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))
[[  0  72   0]
 [  0 119   0]
 [  0  63   0]]
Accuracy: 0.468503937007874
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        72
           1       0.47      1.00      0.64       119
           2       0.00      0.00      0.00        63

    accuracy                           0.47       254
   macro avg       0.16      0.33      0.21       254
weighted avg       0.22      0.47      0.30       254

In [175]:
from sklearn.model_selection import KFold # import KFold
kf = KFold(n_splits=10) # define the split into 10 folds (original comment wrongly said 2)
kf.get_n_splits(X) # returns the number of splitting iterations in the cross-validator
# NOTE(review): `kf` is never used after this cell — no cross-validation is actually run.
print(kf) 
KFold(n_splits=10, random_state=None, shuffle=False)
In [178]:
# Standardise the independent attributes (z-score: zero mean, unit variance)
# so the RBF SVM and PCA are not dominated by large-scale columns.
# NOTE(review): `zscore` was already imported at the top — this re-import is
# redundant. Also, the name `vehicle` is reused here for the SCALED feature
# frame; the original dataframe is no longer reachable from this point on.
from scipy.stats import zscore
vehicle=X.apply(zscore)
vehicle.head()
Out[178]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scaled_radius_of_gyration.1 skewness_about skewness_about.1 hollows_ratio
0 0.160580 0.517302 0.056545 0.272965 1.310206 0.311542 -0.327938 -0.074340 0.380665 0.183957
1 -0.325470 -0.624564 0.120112 -0.835442 -0.593962 0.094079 -0.059987 0.537873 0.156589 0.452977
2 1.254193 0.843549 1.518571 1.201630 0.548539 0.311542 0.073989 1.558230 -0.403603 0.049447
3 -0.082445 -0.624564 -0.007021 -0.296217 0.167705 0.094079 -1.265769 -0.074340 -0.291565 1.529056
4 -1.054545 -0.135193 -0.769817 1.081803 5.245485 9.444962 7.308682 0.537873 -0.179527 -1.699181
In [179]:
covMatrix = np.cov(vehicle,rowvar=False)
print(covMatrix)
[[ 1.00118343  0.68623251  0.79084412  0.69065619  0.09181254  0.14842463
  -0.25036693  0.23596607  0.15757316  0.36598446]
 [ 0.68623251  1.00118343  0.79395399  0.62170187  0.15354334  0.25150523
   0.05270433  0.14456452 -0.01186527  0.04537164]
 [ 0.79084412  0.79395399  1.00118343  0.76798667  0.15858456  0.26486339
  -0.2261194   0.1139479   0.26586756  0.3324884 ]
 [ 0.69065619  0.62170187  0.76798667  1.00118343  0.6643445   0.45056889
  -0.1810333   0.04877731  0.17403799  0.47181974]
 [ 0.09181254  0.15354334  0.15858456  0.6643445   1.00118343  0.64947191
   0.15295712 -0.05855013 -0.0321716   0.26804079]
 [ 0.14842463  0.25150523  0.26486339  0.45056889  0.64947191  1.00118343
   0.29592367  0.01545721  0.04354026  0.14408905]
 [-0.25036693  0.05270433 -0.2261194  -0.1810333   0.15295712  0.29592367
   1.00118343 -0.08840848 -0.12656621 -0.8035581 ]
 [ 0.23596607  0.14456452  0.1139479   0.04877731 -0.05855013  0.01545721
  -0.08840848  1.00118343 -0.03506456  0.09698477]
 [ 0.15757316 -0.01186527  0.26586756  0.17403799 -0.0321716   0.04354026
  -0.12656621 -0.03506456  1.00118343  0.20533271]
 [ 0.36598446  0.04537164  0.3324884   0.47181974  0.26804079  0.14408905
  -0.8035581   0.09698477  0.20533271  1.00118343]]
In [183]:
pca = PCA(n_components=10)
pca.fit(vehicle)
Out[183]:
PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)
In [184]:
print(pca.explained_variance_)
[3.80271734 1.93672919 1.47560084 1.03822744 0.87869187 0.39203777
 0.25023321 0.1341308  0.06758726 0.03587861]
In [185]:
print(pca.components_)
[[ 0.42459576  0.37899246  0.45520672  0.47610521  0.24340374  0.23375702
  -0.15096281  0.09147698  0.12256951  0.28657462]
 [-0.12287957  0.13389163 -0.04868643  0.11360037  0.40469712  0.46217396
   0.61377529 -0.12017786 -0.17204043 -0.39185835]
 [-0.2862096  -0.43361015 -0.25048985  0.11689054  0.47607586  0.285319
  -0.25647032 -0.23699764  0.09833948  0.46258732]
 [ 0.00982717 -0.00233793 -0.16864527 -0.02037574  0.14204046  0.06337609
  -0.12575983  0.64875133 -0.69962143  0.14382479]
 [-0.04150325 -0.18300893 -0.05761908 -0.09419592 -0.0037019   0.2249679
   0.17117875  0.68485649  0.63467554 -0.02871414]
 [-0.02057196  0.09458633  0.12842031 -0.38670087 -0.47714126  0.72315159
  -0.10705018 -0.12353691 -0.10820912  0.18856976]
 [ 0.78952372 -0.49887307 -0.22478817  0.01289115 -0.08531413  0.07516654
   0.21457223 -0.10769572 -0.07871061  0.01784858]
 [-0.18903679 -0.59243612  0.63090969  0.24537194 -0.11515002  0.06553834
  -0.10822469  0.06135304 -0.1643571  -0.30819681]
 [-0.1535513  -0.08562185  0.29741703 -0.06775349 -0.10104574 -0.22981279
   0.64078609  0.00882983 -0.09086826  0.62775592]
 [-0.19442275  0.02810132 -0.37733897  0.72316366 -0.52127285  0.10005717
   0.09886869  0.02031665 -0.01581446  0.06233   ]]
In [186]:
print(pca.explained_variance_ratio_)
[0.37982224 0.19344399 0.14738566 0.10370002 0.08776532 0.03915744
 0.02499374 0.01339723 0.00675074 0.00358362]
In [202]:
# Scree bar chart: fraction of variance explained by each principal component.
component_indices = range(10)
plt.bar(component_indices, pca.explained_variance_ratio_, alpha=0.5, align='center')
plt.ylabel('Variation explained')
plt.xlabel('eigen Value')
plt.show()
In [201]:
# Cumulative scree plot: running total of the explained-variance ratios.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
plt.step(range(10), cumulative_variance, where='mid')
plt.ylabel('Cum of variation explained')
plt.xlabel('eigen Value')
plt.show()
In [256]:
# Now 8 dimensions seems very reasonable. With 8 variables we can explain over 95% of the variation in the original data!
pca3 = PCA(n_components=8)
pca3.fit(vehicle)
print(pca3.components_)
print(pca3.explained_variance_ratio_)
Xpca3 = pca3.transform(vehicle)
[[ 0.42459576  0.37899246  0.45520672  0.47610521  0.24340374  0.23375702
  -0.15096281  0.09147698  0.12256951  0.28657462]
 [-0.12287957  0.13389163 -0.04868643  0.11360037  0.40469712  0.46217396
   0.61377529 -0.12017786 -0.17204043 -0.39185835]
 [-0.2862096  -0.43361015 -0.25048985  0.11689054  0.47607586  0.285319
  -0.25647032 -0.23699764  0.09833948  0.46258732]
 [ 0.00982717 -0.00233793 -0.16864527 -0.02037574  0.14204046  0.06337609
  -0.12575983  0.64875133 -0.69962143  0.14382479]
 [-0.04150325 -0.18300893 -0.05761908 -0.09419592 -0.0037019   0.2249679
   0.17117875  0.68485649  0.63467554 -0.02871414]
 [-0.02057196  0.09458633  0.12842031 -0.38670087 -0.47714126  0.72315159
  -0.10705018 -0.12353691 -0.10820912  0.18856976]
 [ 0.78952372 -0.49887307 -0.22478817  0.01289115 -0.08531413  0.07516654
   0.21457223 -0.10769572 -0.07871061  0.01784858]
 [-0.18903679 -0.59243612  0.63090969  0.24537194 -0.11515002  0.06553834
  -0.10822469  0.06135304 -0.1643571  -0.30819681]]
[0.37982224 0.19344399 0.14738566 0.10370002 0.08776532 0.03915744
 0.02499374 0.01339723]
In [257]:
Xpca3
Out[257]:
array([[ 9.53750669e-01,  4.22087773e-01,  6.84377687e-01, ...,
        -4.14742138e-01, -3.17878795e-01, -4.52953340e-01],
       [-6.33298164e-01, -6.47179549e-01,  9.31498865e-02, ...,
         6.45996640e-01, -4.50725491e-04,  2.51035773e-01],
       [ 2.41800502e+00,  2.95587506e-01, -1.01971534e+00, ...,
        -3.99529070e-01,  1.00855693e-01,  6.12033890e-01],
       ...,
       [ 2.92118712e+00,  5.21099755e-01, -1.71422149e-01, ...,
         2.27501984e-01,  2.77152597e-01, -1.20978897e-01],
       [-1.42323876e+00, -1.06963313e+00,  1.18383830e+00, ...,
         1.77489360e-01, -1.10397348e-01,  5.39670757e-01],
       [-2.75623925e+00, -5.01406489e-01,  3.46624913e-01, ...,
         4.81034819e-02,  1.85030748e-01,  1.74872334e-01]])
In [258]:
sns.pairplot(pd.DataFrame(Xpca3))
plt.show()
In [259]:
Xpca3_train, Xpca3_test, y_train, y_test = train_test_split(Xpca3, y, test_size=0.3, random_state=100)
In [260]:
# Building a Support Vector Machine on PCA train data
Xpca3_svc_model = SVC(C= 1, kernel='rbf', gamma= 1)
Xpca3_svc_model.fit(Xpca3_train, y_train)

Xpca3_y_pred = Xpca3_svc_model.predict(Xpca3_test)

cnf_matrix = metrics.confusion_matrix(y_test, Xpca3_y_pred)
print(cnf_matrix)
print("Accuracy:",metrics.accuracy_score(y_test, Xpca3_y_pred))
print(metrics.classification_report(y_test, Xpca3_y_pred))
[[ 58  13   1]
 [  4 109   6]
 [  0  17  46]]
Accuracy: 0.8385826771653543
              precision    recall  f1-score   support

           0       0.94      0.81      0.87        72
           1       0.78      0.92      0.84       119
           2       0.87      0.73      0.79        63

    accuracy                           0.84       254
   macro avg       0.86      0.82      0.83       254
weighted avg       0.85      0.84      0.84       254

In [261]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Accuracy:",metrics.accuracy_score(y_test, Xpca3_y_pred))
Accuracy: 0.468503937007874
Accuracy: 0.8385826771653543
In [262]:
# We can see that after appling PCA and reducing the dimensions we were able to increase the accuracy of our model to 84%.
In [277]:
# Repeat PCA keeping only 4 components.
# NOTE(review): 4 components explain only ~82% of the variance
# (0.380 + 0.193 + 0.147 + 0.104 — see the printed ratios below), NOT 95%;
# the "over 95%" claim holds for the 8-component model above, not here.
pca3_1 = PCA(n_components=4)
pca3_1.fit(vehicle)
print(pca3_1.components_)
print(pca3_1.explained_variance_ratio_)
Xpca3_1 = pca3_1.transform(vehicle)
[[ 0.42459576  0.37899246  0.45520672  0.47610521  0.24340374  0.23375702
  -0.15096281  0.09147698  0.12256951  0.28657462]
 [-0.12287957  0.13389163 -0.04868643  0.11360037  0.40469712  0.46217396
   0.61377529 -0.12017786 -0.17204043 -0.39185835]
 [-0.2862096  -0.43361015 -0.25048985  0.11689054  0.47607586  0.285319
  -0.25647032 -0.23699764  0.09833948  0.46258732]
 [ 0.00982717 -0.00233793 -0.16864527 -0.02037574  0.14204046  0.06337609
  -0.12575983  0.64875133 -0.69962143  0.14382479]]
[0.37982224 0.19344399 0.14738566 0.10370002]
In [278]:
Xpca3_1
Out[278]:
array([[ 0.95375067,  0.42208777,  0.68437769, -0.05573369],
       [-0.63329816, -0.64717955,  0.09314989,  0.22871068],
       [ 2.41800502,  0.29558751, -1.01971534,  1.11850812],
       ...,
       [ 2.92118712,  0.52109976, -0.17142215,  0.29352311],
       [-1.42323876, -1.06963313,  1.1838383 , -1.75365986],
       [-2.75623925, -0.50140649,  0.34662491, -1.22741852]])
In [279]:
sns.pairplot(pd.DataFrame(Xpca3_1))
plt.show()
In [280]:
Xpca3_1_train, Xpca3_1_test, y_train, y_test = train_test_split(Xpca3_1, y, test_size=0.3, random_state=100)
In [281]:
# Building a Support Vector Machine on PCA train data
Xpca3_1_svc_model = SVC(C= 1, kernel='rbf', gamma= 1)
Xpca3_1_svc_model.fit(Xpca3_1_train, y_train)

Xpca3_1_y_pred = Xpca3_1_svc_model.predict(Xpca3_1_test)

cnf_matrix = metrics.confusion_matrix(y_test, Xpca3_1_y_pred)
print(cnf_matrix)
print("Accuracy:",metrics.accuracy_score(y_test, Xpca3_1_y_pred))
print(metrics.classification_report(y_test, Xpca3_1_y_pred))
[[ 55  11   6]
 [  5 105   9]
 [  7  19  37]]
Accuracy: 0.7755905511811023
              precision    recall  f1-score   support

           0       0.82      0.76      0.79        72
           1       0.78      0.88      0.83       119
           2       0.71      0.59      0.64        63

    accuracy                           0.78       254
   macro avg       0.77      0.74      0.75       254
weighted avg       0.77      0.78      0.77       254

In [284]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Accuracy:",metrics.accuracy_score(y_test, Xpca3_1_y_pred))
Accuracy: 0.468503937007874
Accuracy: 0.7755905511811023
In [285]:
# if we reducing the dimensions to 4 we were able to increase the accuracy of our model to 77.5%.
In [ ]: